
**************************************************************************************************************************************************
******************* Code for 'Doing Less with Less: Capital Misallocation, Investment and the Productivity Slowdown in Australia' *******************
***************************************************** Code for Data manipulation ******************************************************************

**************************************************DATE: AUG 2022 ******************************************************************************
**************************************************** AUTHOR: Jonathan Hambur *******************************************************************
*********************************************************************************************************************************************

*** Code structure
*00. Preliminaries and globals
*01. Initial data clean
*02. Merge outside data
*03. Variable construction


**** Imported data series
* m_DLW_tl_m_w_y: estimated firm mark-up. TL indicates translog (blank = Cobb-Douglas); M indicates it is based on the intermediate input elasticity. W and Y indicate that wages and year FE, respectively, were used in the first stage. See Hambur (2021)
* prod_DLW_tl_w_y: log MFP, from the above estimation
* K bit




****** 00. Preliminaries - listing file data


* Settings that select the input file and the mark-up estimation variant.
* These locals are spliced into the input/output file names below, so they
* must match the definitions used when the mark-ups were estimated.
* (The capital/instrument/trim/drops settings used to be defined twice with
*  identical values; only one copy is kept so an edit cannot be silently
*  overridden by a later duplicate.)
local ind industry
local file _bit
local output income
local input variable2
local files `file'm

** Capital stock measure: current (0) or lagged book value of K stock (presumably 1 - confirm), or perpetual-inventory estimate (2)
local k_meas = 1

* Define the instrument sets
local k_instr k

**** Trim extreme growth? No (0); if large share of industry change (1); if an outlier in the industry (2). Based on multi-product rules
local trim = 0

local drops 1

local cuts 1

** Mark-up/MFP measure to use - can change to the non-_tl version for Cobb-Douglas
* The local and the global `reg' hold the same name: the local is used in file
* names and merge lists, the global in variable references.
local reg m_DLW_tl_m_w_y
global reg m_DLW_tl_m_w_y
global prod prod_DLW_tl_w_y




*** 01 Initial data cleaning

****** Load in file
clear
use "$outputs\mark_k_`k_meas'_instr_`k_instr'_out_`output'_int_`input'_trm_`trim'_`ind'_`file'_l`cuts'.dta" , clear


local list_mark  $reg
* Negative mark-ups are dropped from the firm-level data for the rest of the file
drop if $reg <0

*** Make industry measures
* For each mark-up measure: trim at the pooled 1st/99th percentiles, collapse to
* industry-year means, save that summary file, run the Figure B1 regressions and
* save the 2004-2017 change in the industry mean mark-up with its quartile.
*
* Fixes relative to the earlier version:
*  - preserve now sits inside the loop so it pairs with restore on every pass
*    (previously a second list element would hit restore with nothing preserved);
*  - the lagged mean is referenced with the dollar-brace macro form; the earlier
*    brace-dollar form does not expand to a variable name;
*  - a redundant egen of the industry mean was removed, since collapse
*    immediately recreates that variable;
*  - esttab stats() now asks for r2_a, the name under which xtreg stores the
*    adjusted R-squared (ar2 is not a stored result, so that row was empty).
*
* NOTE(review): collapse keeps only the by() variables and the collapsed mean,
* so division and q_FINDEPRZ used in the xtreg conditions are not in memory at
* that point unless they are merged in - TODO confirm where they should come from.
* NOTE(review): the regressions and the change metric reference the global reg
* rather than the loop variable; identical while list_mark has a single element.
foreach i in `list_mark' {
    preserve
    xtset firmid fyear

    *** Get trimming thresholds (pooled 1st and 99th percentiles)
    egen `i'1 = pctile(`i'), p(1)
    egen `i'6 = pctile(`i'), p(99)
    ** Do very basic trim, according to trim=0
    drop if `i' < `i'1
    drop if `i' > `i'6

    * Industry-year mean mark-up
    collapse (mean) mi_`i'=`i', by(`ind' fyear)
    save "$outputs\sum_k_`k_meas'_instr_`k_instr'_out_`output'_int_`input'_trm_`trim'_`ind'_`file'_l`cuts'.dta", replace

    ** Simple comparison of mark-ups by financial dependence
    eststo clear
    xtset industry fyear
    eststo: xtreg mi_$reg i.fyear if division!="A" & division !="B" & division !="D" & q_FINDEPRZ==4, fe
    eststo: xtreg mi_$reg i.fyear if division!="A" & division !="B" & division !="D" & q_FINDEPRZ!=4, fe
    esttab using "$regs\figure_b1.csv", star(* 0.10 ** 0.05 *** 0.01)  stats(N r2 r2_a) se replace mtitle("fin intense"  "non fin intense")
    eststo clear

    ** Mark-up change metric: 2017 mean minus 2004 mean within industry
    * Industries without a 2004 observation get a missing change
    keep if fyear == 2004 | fyear == 2017
    bysort industry (fyear): g change_mi = mi_${reg} - mi_${reg}[_n-1]
    keep fyear change_mi industry
    keep if fyear == 2017
    xtile mu_change_q = change_mi, nq(4)

    save "$outputs\industry_mu_change", replace
    restore
}


*** 02 Merge outside data
*** The dataset used for mark-up estimation is kept skinny, so extra firm
*** variables and industry metrics are brought in here. Every merge keeps only
*** master and matched observations unless noted, and discards _merge.

** Extra demographic data (state of the firm)
merge 1:1 fyear id using "$outputs\frame.dta", keepusing(x_state) keep(master match) nogenerate

** State unemployment data from ABS LFS - csv included in supplementary material
rename x_state State
merge m:1 fyear State using "$data\Unemp.dta", keep(master match) nogenerate

** Extra balance sheet data can be brought in here if needed. Taken from BIT - not used
*merge 1:1 fyear id using "$outputs\extra_bs.dta", keepusing(ncl ncl_ currliab totlliab) keep(master match)
*drop _merge

** Bring in the dlgo metric: the skinny version only keeps observations with
** estimated MFP, so growth rates are lost whenever the previous year had gross
** output but no MFP estimate
merge 1:1 fyear firmid using "$data\addit_sales_grow.dta", keepusing( dlgo) keep(master match) nogenerate

** Extra age data, taken from the birthdate dataset
merge m:1 tsid id using "$outputs\birth.dta", keep(master match) nogenerate

** Rajan and Zingales financial dependence and intangibles metrics - included as csv in supplementary material
* NOTE(review): hard-coded drive path, unlike the other merges which use the path globals
merge m:1 industry using "P:\mANIP DATA\rajan_zing.dta", keep(master match) nogenerate

** PIM measures
merge 1:1 fyear id using "$outputs\pim.dta", keepusing(rk_pim* inv r_inv) keep(master match) nogenerate

** Industry mark-up measures
** Levels
* This merge has no keep() restriction, so using-only rows would be retained.
* NOTE(review): the summary file written earlier in this do-file contains only
* the mi_ variable; wig_ must come from a version of that file produced
* elsewhere - TODO confirm, otherwise keepusing() will fail.
merge m:1 fyear industry using "$outputs\sum_k_`k_meas'_instr_`k_instr'_out_`output'_int_`input'_trm_`trim'_`ind'_`file'_l`cuts'.dta", keepusing(mi_`reg' wig_`reg') nogenerate
* Changes
merge m:1  industry using "$outputs\industry_mu_change", keep(master match) nogen
compress




********************* 03 Variable construction

* Industry-by-year group identifier
egen ind_yr = group(industry fyear)


***** Productivity measures

**** Labour

* Value added per FTE, in levels and logs
gen l_prod_fte_va = YVA/fte
gen ll_prod_fte_va = ln(l_prod_fte_va)
* Gross output measure: l_prod_fte is already in the data, only its log is built here
gen ll_prod_fte = ln(l_prod_fte)


** Industry-year means of the level measures
bysort industry fyear: egen prod_va_mean = mean(l_prod_fte_va)
bysort industry fyear: egen prod_mean = mean(l_prod_fte)

** Log deviation of each firm from its industry-year mean
gen dev_prod_lva = ln(l_prod_fte_va/prod_va_mean)
gen dev_prod_l = ln(l_prod_fte/prod_mean)

** Pooled 1st and 99th percentiles of the deviations, stored as constants for outlier trimming
foreach dev in dev_prod_lva dev_prod_l {
    foreach pct in 1 99 {
        egen `dev'`pct' = pctile(`dev'), p(`pct')
    }
}
	

**** TFP
* Level MFP: the global prod names the log MFP series, so exponentiate it
g exp_prod_DLW_tl_w_y=exp($prod)
bysort industry fyear: egen prod_MFP_mean = mean(exp_prod_DLW_tl_w_y)

* Log deviation from the industry-year mean, and its within industry-year quartile cut-offs
g dev_prod_MFP = log(exp_prod_DLW_tl_w_y/prod_MFP_mean)
bysort industry fyear: egen dev_prod_MFP25 =  pctile(dev_prod_MFP), p(25)
bysort industry fyear: egen dev_prod_MFP50 =  pctile(dev_prod_MFP), p(50)
bysort industry fyear: egen dev_prod_MFP75 =  pctile(dev_prod_MFP), p(75)

* Quartile of the deviation within industry-year.
* Upper bounds are inclusive (the same convention xtile uses). The earlier
* version used strict inequalities on both sides, so any firm sitting exactly
* on the 25th, 50th or 75th percentile was left with a missing quartile.
g q_dev_prod_MFP = 1 if dev_prod_MFP<=dev_prod_MFP25 & dev_prod_MFP!=.
replace q_dev_prod_MFP=2 if dev_prod_MFP>dev_prod_MFP25 & dev_prod_MFP<=dev_prod_MFP50 & dev_prod_MFP!=.
replace q_dev_prod_MFP=3 if dev_prod_MFP>dev_prod_MFP50 & dev_prod_MFP<=dev_prod_MFP75 & dev_prod_MFP!=.
replace q_dev_prod_MFP=4 if dev_prod_MFP>dev_prod_MFP75 & dev_prod_MFP!=.

* Outliers: pooled 1st and 99th percentiles of the deviation
egen dev_prod_MFP1 = pctile(dev_prod_MFP), p(1)
egen dev_prod_MFP99 = pctile(dev_prod_MFP), p(99)

**** Capital productivity
* Value added per unit of capital, in levels and logs
gen va_k = YVA/K
gen lva_k = ln(va_k)
* Log deviation from the industry-year mean
bysort industry fyear: egen prod_kva_mean = mean(va_k)
gen dev_prod_kva = ln(va_k/prod_kva_mean)
* Outliers: pooled 1st and 99th percentiles of the deviation
foreach pct in 1 99 {
    egen dev_prod_kva`pct' = pctile(dev_prod_kva), p(`pct')
}

  
  
******** Construct LHS variables
compress
* Declare the firm-year panel so the lag operator can be used below
xtset firmid fyear

** Bounded FTE growth: change over the average of current and lagged FTE,
** defined only when both are strictly positive and non-missing
gen dfte = (fte-L.fte)/(0.5*fte+0.5*L.fte) if fte>0 & L.fte>0 & fte!=. & L.fte!=.
 *replace dfte = 2 if l.fte ==. & fte > 0 & fyear !=2002 // check if want to have entry or exit in here?
 *replace dfte = 2 if l.fte ==0 & fte > 0

** Bounded PIM growth: same formula for every rk_pim* capital series
ds rk_pim*
foreach var in `r(varlist)' {
    gen d_`var'_bound = (`var'-L.`var')/(0.5*`var'+0.5*L.`var') if `var'>0 & L.`var'>0 & `var'!=. & L.`var'!=.
}


** Bounded capital stock growth - note the timing is equated to Decker et al,
** where capital at the end of the previous year is the relevant metric
gen d_k_bound = (K-L.K)/(0.5*K+0.5*L.K) if K>0 & L.K>0 & K!=. & L.K!=.


** Investment rates
* Deflated book capital; note this uses K at the end of the current year
gen K_contemp = k_bit/k_defl
* Real investment over lagged capital; r_inv is not scaled, so scale to $m
gen i_k = (r_inv/L.K_contemp)/1000000
* Winsorise a bit above the 99th percentile
replace i_k = 2.6 if i_k>2.6 & i_k<.
gen l_i_k = L.i_k

* Investment over nominal output, top-coded at 1
gen i_y = (inv/Y_nom)/1000000
replace i_y = 1 if i_y>1 & i_y<.


************** Size and age
* Size class from FTE: 1 = under 5, 2 = 5 to under 20, 3 = 20 to under 200,
* 4 = 200 to under 500, 5 = 500 and over. Left missing when fte is missing.
gen size = cond(fte<5, 1, cond(fte<20, 2, cond(fte<200, 3, cond(fte<500, 4, 5)))) if fte!=.

* Age in years, from the year identifier and year of birth
gen age = tsid-yob
tab age if tsid == 2005
* Young-firm indicator (age of 5 or less), missing when age is missing.
* Note there are issues with age: lots of firms were 'born' in 2001 due to the
* introduction of the GST, so the young variable is biased pre 2005
gen young = (age<=5) if age!=.



** Exit identifier: flags a firm's last observed year when that year is before
** the final year in the data
egen max_year = max(fyear)
bysort firmid (fyear): gen exit = (fyear < max_year & _n == _N)


********************************* Balance sheet metrics
* Gearing is defined here as assets / liabilities. The scaling factor is needed
* because total assets had been scaled down to $m.
g gearing =totlasst/totlliab*1000000
replace gearing =  10 if gearing>10 & gearing!=. // topcode at 10
* Firms with positive assets and no liabilities are set to the topcode.
* Missing totlasst is excluded explicitly: in Stata missing counts as larger
* than any number, so totlasst>0 alone would also topcode firms whose assets
* are unknown.
replace gearing = 10 if totlasst>0 & totlasst!=. & totlliab==0


* Industry-year percentiles of gearing
foreach var in gearing  {
	bysort industry fyear: egen p50_`var' = pctile(`var'), p(50)
	bysort industry fyear: egen p75_`var' = pctile(`var'), p(75)
	bysort industry fyear: egen p25_`var' = pctile(`var'), p(25)
	bysort industry fyear: egen p10_`var' = pctile(`var'), p(10)
	bysort industry fyear: egen p90_`var' = pctile(`var'), p(90)

}

** Groupings for high/low gearing: 1 if below the industry-year percentile.
** Left missing when gearing is missing, in line with young and cf_p; the
** earlier version coded those firms as 0.
g gear_50 = (gearing<p50_gearing) if gearing!=.
g gear_25 = (gearing<p25_gearing) if gearing!=.
g gear_10 = (gearing<p10_gearing) if gearing!=.

*** Cash flow measures
* Nominal output less total expenses, adding back depreciation expenses
g cf = Y_nom-totlexps+deprexps
g cf_p = (cf>0) // cashflow positive indicator
replace cf_p = . if cf==.

compress

******* Demean some of the industry or state metrics so the focus is on changes
** Industry mark-up measures: subtract the within-industry mean (taken over all
** firm-year observations of the industry) from the mi_ and wig_ series
foreach stub in mi wig {
    bysort industry: egen mean_`stub'_$reg = mean(`stub'_$reg)
    gen demean_`stub'_$reg = `stub'_$reg - mean_`stub'_$reg
}


** Unemployment
* National rate, relative to its mean over all observations.
* NOTE(review): the name demand_unemp_nat looks like a typo for demean_unemp_nat;
* kept as is because other code may reference it.
egen mean_unemp_nat = mean(Unemp_nat)
gen demand_unemp_nat = Unemp_nat - mean_unemp_nat

* State rate, relative to the state's own mean
bysort State: egen mean_unemp_state = mean(Unemp_state)
gen demean_unemp_state = Unemp_state - mean_unemp_state


******** Period variables

***** Splitting the sample into 3 periods
** Main definition: before 2008, 2008 to 2011, 2012 onwards
gen period1 = cond(tsid<2008, 1, cond(tsid<2012, 2, 3))

** Alternate test definition: before 2008, 2008 to 2010, 2011 onwards
gen period2 = cond(tsid<2008, 1, cond(tsid<2011, 2, 3))

** Trend as in Andrews and Hansell. Not preferred as it is a parametric restriction
* Linear and squared trend with base year 2006, then with base year 2005
gen trend = tsid-2006
gen trend2 = trend^2

gen trend_05 = tsid-2005
gen trend2_05 = trend_05^2

	
**** Defining digital intensity based on Calvino et al OECD paper
** Digital intensity category (1 to 4), assigned from the industry code and division
* Rules are applied in sequence, so a later rule overrides an earlier one.
* Codes exactly on a range boundary (e.g. 1300) and codes outside all ranges are
* left missing, as before.
g intense = 1 if  industry < 1300
replace intense = 2 if industry > 1300 & industry < 1400
replace intense = 3 if industry > 1400 & industry < 1700
replace intense = 2 if industry > 1700 & industry < 2300
replace intense = 4 if industry > 2300 & industry < 2400
replace intense = 3 if industry > 2400 & industry < 2599
* Broad default for codes between 2500 and 5800. This rule used to run after
* the 2599 and division rules below and overwrote them whenever the code fell
* in this range, leaving those rules without effect; it now runs first so the
* specific assignments take precedence.
* NOTE(review): assumes division letters line up with these industry code
* ranges - confirm against the classification used.
replace intense = 3 if industry > 2500 & industry < 5800
replace intense = 2 if industry ==2599
replace intense = 1 if division == "D" | division == "E" | division == "H" | division == "I"
replace intense = 3 if division == "G" | division == "F" | division == "R" | division == "S"
replace intense = 4 if industry > 5800 & industry < 6200
replace intense = 2 if industry == 6010
replace intense = 4 if industry > 6600 & industry < 6700
replace intense = 1 if industry > 6700 & industry < 6900
replace intense = 4 if industry > 6900 & industry < 7500


* Save the regression-ready file
save "$outputs\reg_file.dta", replace